

"""
Goal: Assign a finance sentiment score to each sentence, then
    use the sentence sentiment scores and counts to get a yearly finance sentiment score for each language
Contact: mjha@gsu.edu (author), manela@wustl.edu, hongyi.liu@wustl.edu
Notes: because of the size of the corpus, we suggest running one language and one year at a time.
    The BERT models and tokenizers need to be downloaded only once. The code for this is provided at the end.
"""

# --- Directories -----------------------------------------------------------
# Both values are used as plain string prefixes when building file names,
# so a non-empty directory must end with "/".
pdir = ""          # prefix of the n-gram sample pickle (input) and the score CSV (output)
bertdir = "bert/"  # prefix of the pickled BERT tokenizers and models

# --- Run selection ---------------------------------------------------------
# One language and one year are processed per run.
lnum = 5    # position of the language in `langlist`
yr = 1870   # year of the sample to score

############################### 1. Setup
# import
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# One row per supported corpus: (language code, tokenizer pickle, model pickle).
# The three public lists below are derived from this table so they always stay
# index-aligned; `lnum` selects the same position in each of them.
_LANG_RESOURCES = [
    ("fre", "token_multi.p", "bert_multi.p"),
    ("ger", "token_multi.p", "bert_multi.p"),
    ("ita", "token_multi.p", "bert_multi.p"),
    ("spa", "token_multi.p", "bert_multi.p"),
    ("eng-gb", "token.p", "bert.p"),
    ("eng-us", "token.p", "bert.p"),
    ("rus", "token_multi.p", "bert_multi.p"),
    ("chi-sim", "token_chi.p", "bert_chi.p"),
]

langlist = [code for code, _tok, _mdl in _LANG_RESOURCES]

tokenlist = [tok for _code, tok, _mdl in _LANG_RESOURCES]
modellist = [mdl for _code, _tok, mdl in _LANG_RESOURCES]


# Anchor sentence pairs: one inner list per language, indexed by `lnum` just
# like `langlist`. Each tuple is (positive statement about finance, negative
# statement about finance). The script sums embed(positive) - embed(negative)
# over the selected language's pairs to build the sentiment direction vector.
# NOTE(review): the strings are passed to the model verbatim, including the
# leading/trailing spaces some of them carry. A few negative sentences look like
# unpolished translations (e.g. the 4th Chinese pair ends in 金融 "finance" where
# its positive counterpart ends in 经济 "economy"). Editing any of them would
# change the resulting scores — confirm with the authors before touching them.
gblistall = [[("la finance est bonne pour la société", "la finance est mauvaise pour la société"), ("les professionnels de la finance sont surtout bons", "les professionnels de la finance sont surtout mauvais"), ("la finance a un impact positif sur notre monde", "la finance a un impact négatif notre monde"), ("le système financier aide l'économie "," le système financier nuit à l'économie "), (" les services financiers profitent à la société "," les services financiers nuisent à la société ")],
             [("Finanzen sind gut für die Gesellschaft", "Finanzen sind schlecht für die Gesellschaft"), ("Finanzprofis sind meistens gut", "Finanzprofis sind meistens böse"), ("Finanzen wirken sich positiv auf unsere Welt aus", "Finanzen wirken sich negativ aus unsere Welt "), (" Finanzsystem hilft der Wirtschaft "," Finanzsystem schadet der Wirtschaft "), (" Finanzdienstleistungen kommen der Gesellschaft zugute "," Finanzdienstleistungen schaden der Gesellschaft ")],
             [("la finanza fa bene alla società", "la finanza fa male alla società"), ("i professionisti della finanza sono per lo più buoni", "i professionisti della finanza sono principalmente cattivi"), ("la finanza ha un impatto positivo sul nostro mondo", "la finanza ha un impatto negativo il nostro mondo "), (" il sistema finanziario aiuta l'economia "," il sistema finanziario danneggia l'economia "), (" i servizi finanziari avvantaggiano la società "," i servizi finanziari danneggiano la società ")],
             [("las finanzas son buenas para la sociedad", "las finanzas son malas para la sociedad"), ("los profesionales financieros son en su mayoría buenos", "los profesionales financieros son en su mayoría malos"), ("las finanzas impactan positivamente en nuestro mundo", "las finanzas impactan negativamente nuestro mundo "), ("el sistema financiero ayuda a la economía","el sistema financiero perjudica a la economía"), ("los servicios financieros benefician a la sociedad","los servicios financieros perjudican a la sociedad")],
             [("finance is good for society","finance is bad for society"), ("finance professionals are mostly good people","finance professionals are mostly corrupt people"), ("finance positively impacts our world","finance negatively impacts our world"), ("financial system helps the economy","financial system hurts the economy"), ("financial services benefit society","financial services damage society")],
             [("finance is good for society","finance is bad for society"), ("finance professionals are mostly good people","finance professionals are mostly corrupt people"), ("finance positively impacts our world","finance negatively impacts our world"), ("financial system helps the economy","financial system hurts the economy"), ("financial services benefit society","financial services damage society")],
             [("Финансовые услуги по пользу обществу","Финансовые услуги Повреждение общества"), ("финансы полезны для общества","финансы вредны для общества"), ("профессионалы в области финансов в основном хороши","профессионалы в области финансов в основном злые"), ("финансы положительно влияют на наш мир","финансы негативно влияют наш мир"), ("финансовая система помогает экономике","финансовая система наносит ущерб экономике")], ## new
             [("金融对社会好","金融对社会不好"), ("财务专业人员大多很好","财务专业人员大多邪恶"), ("金融对世界产生积极影响", "金融对世界产生消极影响"), ("金融系统帮助经济", "金融系统有害金融"), ("金融服务有益社会", "金融服务损害社会")],
              ]

########################### 2. Create functions
def get_embed(text_input):
    """Return the sentence embedding of ``text_input`` taken at the [CLS] token.

    The text is lower-cased, wrapped in the ``[CLS]`` / ``[SEP]`` markers,
    tokenized with the module-level ``tokenizer`` and passed through the
    module-level ``model``. The embedding is the element-wise sum of the last
    four hidden layers at the ``[CLS]`` position (token 0).

    Args:
        text_input: the sentence to embed, as a ``str``.

    Returns:
        A 1-D torch tensor: the summed [CLS] vector.

    Notes:
        ``tokenizer`` and ``model`` must be assigned at module level before the
        first call. ``model(tokens, segments)`` must return a pair whose first
        element holds one ``[batch, token, hidden]`` tensor per layer.
    """
    text = text_input.lower()
    marked_text = "[CLS] " + text + " [SEP]"  # bert requirement
    tokenized_text = tokenizer.tokenize(marked_text)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)  # bert requirement: one segment id per token

    # Convert inputs to PyTorch tensors (batch of size 1)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Predict hidden states for each layer; no gradients are needed for
    # inference, which saves memory and speeds up computation.
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)

    # Only the [CLS] vector (batch item 0, token 0) is used as the sentence
    # representation, so read it straight from the last four layers instead of
    # materialising every token's vector for every layer.
    cls_last_four = [layer[0][0] for layer in encoded_layers[-4:]]
    return torch.sum(torch.stack(cls_last_four), 0)


def get_cosine(embed_var, embed_dim):
    """Return the cosine similarity between two embedding vectors.

    Both inputs are reshaped to a single row before being compared with
    ``cosine_similarity``.

    Args:
        embed_var: embedding of the sentence being scored (must support
            ``reshape``).
        embed_dim: embedding of the reference direction (must support
            ``reshape``).

    Returns:
        The scalar similarity, or the string ``"NA"`` when it cannot be
        computed (e.g. an input is missing or not an array).
    """
    try:
        return cosine_similarity(embed_var.reshape(1, -1), embed_dim.reshape(1, -1))[0][0]
    except Exception:
        # Best-effort: one bad row must not abort the run. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt / SystemExit through.
        return "NA"

######################## Working
# Resolve the resources selected by `lnum`.
lang = langlist[lnum]

# SECURITY NOTE: pickle.load can execute arbitrary code. Only load pickles you
# created yourself (e.g. with the download snippet at the end of this file).
# `with` closes each file handle as soon as the object has been loaded.
with open(bertdir + tokenlist[lnum], "rb") as fh:
    tokenizer = pickle.load(fh)
with open(bertdir + modellist[lnum], "rb") as fh:
    model = pickle.load(fh)
model.eval()

# Load the sample for this language/year; the `ngrams` and `counts` columns
# are read below.
with open(pdir + lang + str(yr) + '_sample.p', "rb") as fh:
    fin_ngrams = pickle.load(fh)
fin_ngrams['embed'] = fin_ngrams.apply(lambda row: get_embed(row.ngrams), axis=1)
# Drop rows whose embedding is the 'NA' sentinel. The check is explicit rather
# than an elementwise `!=` between tensors and a string; `.copy()` avoids
# assigning new columns on a slice of the loaded frame.
is_embedded = fin_ngrams['embed'].map(lambda e: not (isinstance(e, str) and e == 'NA')).astype(bool)
fin_ngrams = fin_ngrams[is_embedded].copy()

### good bad embedding
# Sentiment direction: sum over the anchor pairs of embed(good) - embed(bad).
gblist = gblistall[lnum]
gb = torch.zeros(768)  # accumulator tensor, size 768

for good_text, bad_text in gblist:
    gb = gb + get_embed(good_text) - get_embed(bad_text)
print(gb[1:5])

fin_ngrams['cos'] = fin_ngrams.apply(lambda row: get_cosine(row.embed, gb), axis=1)
# get_cosine returns the string "NA" on failure. Multiplying that by an integer
# count would repeat the string instead of producing a number, so such rows are
# removed before the weighted average is computed.
has_cos = fin_ngrams['cos'].map(lambda c: not isinstance(c, str)).astype(bool)
fin_ngrams = fin_ngrams[has_cos].copy()
fin_ngrams['cos'] = fin_ngrams['cos'].astype(float)


# get weighted average: score = sum(cos * counts) / sum(counts)
fin_ngrams['year'] = yr
fin_ngrams['lang'] = lang
fin_ngrams['coscounts'] = fin_ngrams['cos'] * fin_ngrams['counts']
fin_ngrams['tot_coscounts'] = fin_ngrams.groupby('year')['coscounts'].transform('sum')
fin_ngrams['tot_counts'] = fin_ngrams.groupby('year')['counts'].transform('sum')
fin_ngrams['score'] = fin_ngrams['tot_coscounts'] / fin_ngrams['tot_counts']

# save score: every row carries the same yearly score, so keep one row per year
fin_ngrams = fin_ngrams.drop_duplicates(subset="year", keep='first')
fin_ngrams = fin_ngrams[['year', 'lang', 'score']]
print(fin_ngrams)
fin_ngrams.to_csv(pdir + lang + str(yr) + '_sample.csv')


    
##################### download BERT models, and tokenizers
#from pytorch_pretrained_bert import BertTokenizer, BertModel
#model = BertModel.from_pretrained('bert-base-chinese')  # 'bert-base-multilingual-cased' ('bert-base-uncased')
#pickle.dump(model, open(bertdir + "bert_chi.p", "wb" )) #"bert_multi.p"
#tokenizer = BertTokenizer.from_pretrained('bert-base-chinese') # 'bert-base-multilingual-cased', do_lower_case=False  ('bert-base-uncased')
#pickle.dump(tokenizer, open(bertdir + "token_chi.p", "wb" ))   
    
    
    
    
    
    
    
    
    
    











